5 Diagnostics and Analysis
5.1 Investigate high variance wells
nrow.source = function(df, facilityname, sourcename) {
stopifnot(length(sourcename)==1)
return(nrow(df %>% filter(well==facilityname, source==sourcename)))
}
well_summaries = outframe %>%
filter(facility %in% well_names, variable=="mf_pred") %>%
group_by(facility) %>%
summarise(mean = mean(value),
sd = sd(value),
n_test = nrow.source(regression_df, unique(facility),"Well Tests"),
n_pi = nrow.source(regression_df, unique(facility), "PI Database"),
use.test = ifelse(n_test>0, "Test data", "No test data"),
use.pi = ifelse(n_pi>0, "PI data", "No PI data")) %>%
arrange(desc(sd))
well_summaries$production.curve = ifelse(well_summaries$facility %in% liq_wells, "Production curve", "Time series")
fp_summaries = list(fp14 = well_summaries %>% filter(facility %in% flows_to(censor('fp14', 'fp'))),
fp15 = well_summaries %>% filter(facility %in% flows_to(censor('fp15', 'fp'))),
fp16 = well_summaries %>% filter(facility %in% flows_to(censor('fp16', 'fp'))))
for (fp in names(fp_summaries)) {
print(xtable(fp_summaries[[fp]] %>% select(-c(use.test, use.pi, production.curve)),
type = "latex",
caption=paste("Data methods feeding flash plant", censor(fp, 'fp')),
label=paste0("tab:well_summaries_", fp)),
table.placement = "H",
file = paste0("../_media/summaries_", fp, ".tex"))
}
n_summaries = well_summaries %>%
group_by(use.pi, use.test) %>%
count()
sourceplot = ggplot(well_summaries, aes(x=1, y=log(sd))) +
geom_boxplot(fill='steelblue') +
geom_label(data=n_summaries, aes(x=-Inf, y=-Inf, hjust=0, vjust=0, label=paste0("n=", n), family="Times New Roman"), label.size=0, fill='white') +
facet_grid(.~ use.pi + use.test) +
theme(axis.text.x=element_blank(),
axis.ticks.x=element_blank()) +
labs(title="Differences in Production Error by Data Source", x="Production curve data source", y="log(standard deviation)")# +
# ggsave('../_media/error_source.png', width=24.7*0.5, height=6, units='cm')
ggplotly(sourceplot)sourcetab = well_summaries %>% select(facility, mean, sd, n_test, n_pi)
# print(xtable(sourcetab %>% head(), type = "latex",
# caption="Upon inspection of the wells with the most variance, there is no immediate cause for high variance. This requires further investigation.",
# label="tab:well_summaries"),
# table.placement = "h",
# file = "../_media/well_summaries.tex")
kable(cbind(sourcetab)) %>%
kable_styling() %>%
scroll_box(height = "400px")| facility | mean | sd | n_test | n_pi |
|---|---|---|---|---|
| wk86 | 81.5344741 | 61.3421227 | 4 | 0 |
| wk28 | 88.7191863 | 49.1865621 | 26 | 0 |
| wk242 | 321.9122120 | 46.6096937 | 28 | 0 |
| wk65 | 26.7570149 | 42.1490342 | 2 | 0 |
| wk27 | 163.8370066 | 37.6843482 | 36 | 0 |
| wk81 | 60.1607344 | 27.4466959 | 24 | 0 |
| wk116 | 76.0484841 | 26.6558492 | 10 | 0 |
| wk26b | 112.4015979 | 25.6578444 | 13 | 0 |
| wk76 | 124.3262822 | 23.0842652 | 31 | 0 |
| wk123 | 184.1871324 | 22.9574870 | 31 | 0 |
| wk67 | 92.9119044 | 22.1894908 | 26 | 0 |
| wk59 | 86.9360822 | 22.1636942 | 31 | 0 |
| wk96 | 27.8387902 | 20.8344581 | 10 | 0 |
| wk72 | 151.6238128 | 20.6750130 | 26 | 0 |
| wk71 | 127.4461328 | 19.8326786 | 32 | 0 |
| wk256 | 218.9385203 | 18.8662810 | 32 | 30 |
| wk268 | 90.9012237 | 17.4071984 | 27 | 30 |
| wk266 | 326.5706599 | 17.0555332 | 29 | 30 |
| wk264 | 393.5393997 | 16.4000437 | 34 | 30 |
| wk245 | 418.2219228 | 15.5064443 | 41 | 30 |
| wk26a | 119.5009793 | 14.5287864 | 16 | 0 |
| wk265 | 335.9188636 | 14.4117511 | 34 | 30 |
| wk267 | 308.6025505 | 14.0059520 | 31 | 30 |
| wk255 | 377.3065824 | 13.9818617 | 41 | 30 |
| wk83 | 133.4618281 | 13.4622589 | 26 | 0 |
| wk74 | 156.6504648 | 12.7175994 | 26 | 0 |
| wk269 | 133.2196999 | 11.0219560 | 25 | 30 |
| wk235 | 206.0957866 | 10.3933763 | 20 | 0 |
| wk70 | 151.4158667 | 10.0002375 | 45 | 0 |
| wk55 | 65.1317321 | 9.9032863 | 47 | 0 |
| wk262 | 633.1256048 | 9.5574724 | 36 | 30 |
| wk244 | 196.7817051 | 9.4283939 | 37 | 30 |
| wk229 | 201.3411656 | 9.1752904 | 92 | 0 |
| wk124 | 252.5206841 | 8.9259854 | 31 | 0 |
| wk222 | 105.9781842 | 8.4414258 | 44 | 0 |
| wk243 | 375.1448829 | 8.2169925 | 52 | 30 |
| wk247 | 232.9289526 | 6.6020166 | 48 | 30 |
| wk271 | 86.5929859 | 6.5142536 | 6 | 30 |
| wk207 | 150.3506614 | 6.2642318 | 18 | 0 |
| wk250 | 25.9639557 | 5.9004806 | 0 | 30 |
| wk239 | 132.8546240 | 5.8128930 | 18 | 0 |
| wk46 | 43.9515350 | 5.2028818 | 18 | 0 |
| wk260 | 295.9918124 | 5.0412982 | 33 | 30 |
| wk261 | 250.3107907 | 4.6730155 | 37 | 30 |
| wk253 | 269.9234974 | 4.5727659 | 32 | 30 |
| wk270 | 460.2166869 | 4.0465622 | 5 | 30 |
| wk258 | 198.6058922 | 3.2505974 | 38 | 30 |
| wk263 | 221.6051927 | 3.2470226 | 34 | 30 |
| wk259 | 309.0674371 | 3.1877495 | 37 | 30 |
| wk272 | 208.3516971 | 3.0291599 | 7 | 30 |
| wk254 | 226.6098609 | 2.3835320 | 39 | 30 |
| wk605 | 34.4044858 | 1.3023662 | 0 | 0 |
| wk606 | 22.5021979 | 1.0342556 | 0 | 0 |
| wk237 | 14.3441589 | 0.9160045 | 0 | 30 |
| wk233 | 13.6224547 | 0.5723686 | 0 | 30 |
| wk241 | 45.4437549 | 0.4758516 | 0 | 30 |
| wk249 | 1.1520835 | 0.3806202 | 0 | 0 |
| wk238 | 60.2461360 | 0.3380969 | 0 | 30 |
| wk610 | 6.6595603 | 0.2506634 | 0 | 0 |
| wk607 | 1.4406459 | 0.2482484 | 0 | 0 |
| wk234 | 30.9725253 | 0.2376167 | 0 | 30 |
| wk25 | 6.6234142 | 0.1906251 | 0 | 0 |
| wk251 | 22.6509078 | 0.1783918 | 0 | 30 |
| wk240 | 23.5145992 | 0.1445570 | 0 | 30 |
| wk228 | 14.9865455 | 0.1435981 | 0 | 30 |
| wk236 | 26.3738049 | 0.0825879 | 0 | 0 |
| wk216 | 8.1298059 | 0.0760767 | 0 | 0 |
| wk232 | 0.0847269 | 0.0734232 | 0 | 30 |
| wk252 | 58.8881359 | 0.0566079 | 0 | 30 |
| wk118 | 9.5054893 | 0.0345031 | 0 | 0 |
| wk604 | 0.7825628 | 0.0285853 | 0 | 0 |
| wk92 | 0.0023263 | 0.0023603 | 0 | 0 |
| wk101 | 0.0017559 | 0.0013574 | 0 | 0 |
| wk66 | 0.0000000 | 0.0000000 | 0 | 0 |
| wk88 | 0.0000000 | 0.0000000 | 0 | 0 |
5.2 Regression Fits
prod = as.data.frame(outmatrix) %>%
select(contains('prod')) %>%
gather(key=facility, value=value) %>%
mutate(which=parse_number(facility)) %>%
mutate(whp=data$whp_prod[which],
well = names(ids)[data$well_id_prod[which]]) %>%
rename(mf=value) %>%
group_by(well, whp) %>%
summarise(lower=quantile(mf, 0.025),
upper=quantile(mf, 0.975),
mean=mean(mf))
plotdata = regression_df %>%
filter(well_id %in% ids[production_curve_wells]) %>%
mutate(datetime = factor(as.Date(date))) %>%
mutate(source = factor(source, levels=c("Well Tests", "PI Database")))
# regression plot
regplot = ggplot(prod, aes(x=whp)) +
geom_line(aes(y=mean, color=well)) +
geom_ribbon(aes(ymin=lower, ymax=upper, fill=well), alpha=0.25) +
geom_point(data=plotdata, aes(y=mf, color=well, size=date, shape=source), alpha=0.5) +
labs(title="Linear Regression on Test and PI Data", x="Well-head pressure (bar)", y="Mass flow (T/h)", color="Well", shape="Data source", size="Date", fill="Well") +
coord_cartesian(xlim=c(min(plotdata$whp)*0.9,max(plotdata$whp)*1.1), ylim=c(0,max(plotdata$mf)*1.1))# +
# ggsave('../_media/production_curve.png', width=24.7*0.48, height=24.7*0.48, units='cm')
ggplotly(regplot)5.3 Time Series Plots
tsplotwells = ar_wells
ts_fit = as.data.frame(outmatrix) %>%
select(contains('mf_ts')) %>%
gather() %>%
mutate(index = parse_number(key)) %>% select(-key) %>%
group_by(index) %>%
summarise(lower=quantile(value, 0.025),
upper=quantile(value, 0.975),
mean=mean(value)) %>%
cbind(ts) %>%
mutate(well = factor(names(ids[well_id_ts])),
date_numeric = date_numeric_ts)
# actual observations
tsplotdata = dry_df %>%
filter(well_id %in% ids[tsplotwells]) %>%
mutate(datetime = factor(as.Date(date)),
facility = well)
# experimental AR1 time series
ar_fit = as.data.frame(outmatrix) %>%
select(contains("mu_ar")) %>%
gather() %>%
mutate(date_numeric = as.numeric(str_extract(key, "(?<=\\[)(.*?)(?=,)")) + min(dry_df$date_numeric) - 1,
facility = names(ids)[as.numeric(str_extract(key, "(?<=,)(.*?)(?=\\])"))]) %>%
select(facility, date_numeric, value) %>%
group_by(facility, date_numeric) %>%
summarise(mean=mean(value),
lower=quantile(value, 0.025),
upper=quantile(value, 0.975)) %>%
filter(facility %in% tsplotwells)
# experimental EMA time series
ewma_fit = as.data.frame(outmatrix) %>%
select(contains("mu_ema")) %>%
gather() %>%
mutate(date_numeric = as.numeric(str_extract(key, "(?<=\\[)(.*?)(?=,)")) + min(dry_df$date_numeric) - 1,
facility = names(ids)[as.numeric(str_extract(key, "(?<=,)(.*?)(?=\\])"))]) %>%
select(facility, date_numeric, value) %>%
group_by(facility, date_numeric) %>%
summarise(mean=mean(value),
lower=quantile(value, 0.025),
upper=quantile(value, 0.975)) %>%
filter(facility %in% tsplotwells)
# find plot limits
tsmax = max(c(ts_fit$upper, ar_fit$upper, ewma_fit$upper))
lintsplot = ggplot(ts_fit, aes(x=date_numeric, color=well, fill=well)) +
geom_line(aes(y=mean), linetype="dashed") +
geom_ribbon(aes(ymin=lower, ymax=upper), color=NA, alpha=0.25) +
geom_line(data=tsplotdata, aes(y=mf)) +
geom_vline(aes(xintercept = max(tsplotdata$date_numeric)), linetype="dashed", color="red") +
coord_cartesian(ylim=c(0, 60)) +
labs(title=paste("Linear Time Series Regression for Selected Wells in PI"), x="Days since baseline (2000)", linetype="")# +
# ggsave('../_media/dry_time_series.png', width=24.7, height=8, units='cm')
arplot = ggplot(ar_fit %>% filter(facility %in% tsplotwells), aes(x=date_numeric, y=mean, fill=facility, color=facility)) +
geom_line(data=tsplotdata, aes(y=mf)) +
geom_ribbon(aes(ymin=lower, ymax=upper), color=NA, alpha=0.5) +
geom_line(linetype="dashed") + coord_cartesian(ylim=c(0, 60)) +
geom_vline(aes(xintercept = max(tsplotdata$date_numeric)), linetype="dashed", color="red") +
labs(title="AR(1) Experiment", x="Days since first date", y="Mass flow (T/h)") +
ggsave('../_media/ar_experiment.png', width=24.7, height=8, units='cm')
ewmaplot = ggplot(ewma_fit, aes(x=date_numeric, y=mean, fill=facility, color=facility)) +
geom_line(data=tsplotdata, aes(y=mf)) +
geom_ribbon(aes(ymin=lower, ymax=upper), color=NA, alpha=0.5) +
geom_line(linetype="dashed") + coord_cartesian(ylim=c(0, 60)) +
geom_vline(aes(xintercept = max(tsplotdata$date_numeric)), linetype="dashed", color="red") +
labs(title="EWMA Experiment", x="Days since first date")# +
# ggsave('../_media/ewma_experiment.png', width=24.7, height=8, units='cm')
ggplotly(lintsplot)ggplotly(arplot)ggplotly(ewmaplot)tsgrob = grid_arrange_shared_legend(lintsplot, arplot, ewmaplot, nrow=3, ncol=1, position = "bottom")
tsgrob## TableGrob (2 x 1) "arrange": 2 grobs
## z cells name grob
## 1 1 (1-1,1-1) arrange gtable[arrange]
## 2 2 (2-2,1-1) arrange gtable[guide-box]
# ggsave('../_media/ts_experiment.png', tsgrob, width=24.7, height=24, units='cm')5.4 Goodness of fit (OLS regression)
liq_fit = as.data.frame(outmatrix) %>%
select(contains('mf_fit')) %>%
gather(key='index', value='fitted') %>%
mutate(index=as.integer(parse_number(index))) %>%
group_by(index) %>%
summarise(lower=quantile(fitted, 0.025),
upper=quantile(fitted, 0.975),
Fitted=mean(fitted),
std=sd(fitted)) %>%
cbind(regression_df) %>%
mutate(`Standardised residual` = (Fitted-mf)/std,
Well = factor(names(ids[well_id])),
Observed = mf) %>%
gather(key="key", value="value", `Standardised residual`, Observed) %>%
select(Well, key, Fitted, value, source)
diagplot = ggplot(liq_fit, aes(x=Fitted, y=value)) +
geom_point(aes(color=Well, shape=Well)) + scale_shape_manual(values = rep_len(1:25, length(unique(liq_fit$Well)))) +
geom_smooth(color='black') +
facet_wrap(~key, scales="free") +
geom_hline(data=data.frame(key="Standardised residual", value=c(1.96,-1.96)), aes(yintercept=value), color='red') +
geom_abline(data=data.frame(key="Observed", a = 1, b = 0), aes(slope = a, intercept=b), color='red') +
# coord_cartesian(ylim=c(-4, 4)) +
labs(title="Diagnostic Plots", x="Fitted mass flow (T/h)", y="") +
theme(legend.position = "bottom") +
guides(color=guide_legend(nrow=3, byrow=T), shape=guide_legend(nrow=3, byrow=T))# +
# ggsave('../_media/diagnostics.png', width=24.7, height=12, units='cm')
ggplotly(diagplot)## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
selectwells = liq_fit %>% group_by(Well, key) %>% summarise(fittedsd = sd(Fitted)) %>%
arrange(desc(fittedsd)) %>% head(56*2) %>% pull(Well)
observedplot = ggplot(liq_fit %>% filter(key=="Observed", Well %in% selectwells), aes(x=Fitted, y=value)) +
geom_point(aes(color=source), alpha=0.5) +
geom_smooth(color=NA, alpha=0.5) +
facet_wrap(~Well, scales="free") +
# geom_hline(data=data.frame(key="Standardised residual", value=c(1.96,-1.96)), aes(yintercept=value), color='red') +
geom_abline(data=data.frame(key="Observed", a = 1, b = 0), aes(slope = a, intercept=b)) +
labs(title="Linear Regression Fit Plots Per Well", x="Fitted mass flow (T/h)", y="Observed mass flow (T/h)", color="Data source") +
theme(legend.position = "bottom")# +
# guides(color=guide_legend(nrow=3, byrow=T), shape=guide_legend(nrow=3, byrow=T)) +
# ggsave('../_media/observed.png', width=24.7, height=24.7, units='cm')
stdresplot = ggplot(liq_fit %>% filter(key=="Standardised residual", Well %in% selectwells), aes(x=Fitted, y=value)) +
geom_point(aes(color=source), alpha=0.5) +
geom_smooth(color=NA, alpha=0.5) +
facet_wrap(~Well, scales="free_x") +
geom_hline(data=data.frame(key="Standardised residual", value=c(1.96,-1.96)), aes(yintercept=value), color='red') +
# geom_abline(data=data.frame(key="Observed", a = 1, b = 0), aes(slope = a, intercept=b), color='red') +
labs(title="Linear Regression Residual Plots Per Well", x="Fitted mass flow (T/h)", y="Standardised residual", color="Data source") +
coord_cartesian(ylim=c(-5, 5)) + theme(legend.position="bottom")# +
# guides(color=guide_legend(nrow=3, byrow=T), shape=guide_legend(nrow=3, byrow=T)) +
# ggsave('../_media/stdres.png', width=24.7, height=24.7, units='cm')
ggplotly(observedplot)## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
ggplotly(stdresplot)## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# stdres_min = liq_fit %>% filter(key=="Standardised residual") %>% pull(value) %>% min()
# stdres_max = liq_fit %>% filter(key=="Standardised residual") %>% pull(value) %>% max()
# ggplot(liq_fit %>% filter(key=="Standardised residual"), aes(x=value)) +
# geom_density(fill="red", alpha=0.5, color=NA) +
# geom_line(data=data.frame(x=seq(stdres_min, stdres_max, length.out=100)), aes(x=x, y=dnorm(x)))5.5 Limits and Constraint Violations
sf.df <- outframe %>%
filter(str_detect(variable, "total_sf") & value > 0) %>%
droplevels()
limits = fp_constants %>%
mutate(facility = names(ids)[fp_id]) %>%
select(facility, limit) %>%
drop_na()
p.limits = sf.df %>%
left_join(limits, by=c("facility")) %>%
mutate(greater = value > limit) %>%
group_by(facility) %>%
summarise(p.greater = mean(greater)) %>%
drop_na()
limitplot = ggplot(sf.df %>% filter(facility %ni% incomplete.fps), aes(x=value, fill=facility)) +
facet_wrap(~facility, scales = "free_y", ncol=2) +
geom_density(alpha=0.5, color=NA) +
geom_vline(data=limits, aes(xintercept=limit), color="red") +
geom_label(data=p.limits %>% filter(facility %ni% incomplete.fps), aes(x=-Inf, y=Inf, hjust=0, vjust=1, label=paste0("p(>lim)=", p.greater), family="Times New Roman"), color="black", label.size=0, fill='white') +
theme(legend.position="none",
axis.text.y=element_blank(),
axis.ticks.y=element_blank()) +
labs(title="Posterior Flash Plant Mass Flows", x="Steam flow (T/h)", y="Density", fill="Flash plant", color="Steam flow limit")# +
# ggsave('../_media/constraints.png', width=24.7, height=10, units='cm')
ggplotly(limitplot)5.6 Flow Comparison
flow.df <- outframe %>%
filter(facility %in% fp_names) %>%
filter(str_detect(variable, "mf_pred|ip_sf|lp_sf|wf") & value > 0) %>%
mutate(variable=ifelse(variable=="mf_pred", "mf", variable),
variable=factor(variable, levels=c("mf", "ip_sf", "lp_sf", "wf")))
comparison = fp_constants %>% select("fp", contains("verification")) %>%
rename(facility=fp) %>%
gather(key="variable", value="value", -facility) %>%
mutate(variable = gsub("^verification_", "", variable),
variable=factor(variable, levels=c("mf", "ip_sf", "lp_sf", "wf"))) %>%
drop_na()
ps = flow.df %>%
left_join(comparison, by=c("facility", "variable")) %>%
mutate(greater = value.x > value.y) %>%
group_by(facility, variable) %>%
summarise(p.greater = mean(greater)) %>%
mutate(variable=factor(variable, levels=c("mf", "ip_sf", "lp_sf", "wf"))) %>%
drop_na()
verificationplot = ggplot(flow.df %>% filter(facility %ni% incomplete.fps), aes(x=value)) +
geom_density(aes(y=..scaled.., fill=variable, color=variable), alpha=0.5, show.legend=F) +
geom_vline(data=comparison %>% filter(facility %ni% incomplete.fps), aes(xintercept=value)) +
geom_label(data=ps %>% filter(facility %ni% incomplete.fps), aes(x=-Inf, y=Inf, hjust=0, vjust=1, label=paste0("p(>x)=", p.greater), family="Times New Roman"), label.size=0) +
facet_grid(facility~variable, scales="free", space="free_y") +
theme(axis.text.y=element_blank(),
axis.ticks.y=element_blank()) +
labs(x="Value", y="Scaled density", title="Comparison Between Predicted FP Flows and Sample Data")# +
# ggsave('../_media/verification.png', width=24.7, height=20, units='cm')
ggplotly(verificationplot)